library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
fig.width = 8,
fig.height = 6,
out.width = "90%"
)
theme_set(theme_minimal() + theme(legend.position = "bottom"))
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
##visualizations of US map require install of sf and maps packages
import tidy dataset:
##note: should we export tidy dataset to our directory on github so we can delete this code from any EDA?
rawdata =
read_csv("data/mental_health.csv")
## Rows: 10404 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Indicator, Group, State, Subgroup, Phase, Time Period Label, Time ...
## dbl (5): Time Period, Value, LowCI, HighCI, Suppression Flag
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tidydata =
rawdata |>
janitor::clean_names() |>
mutate(indicator = str_replace(indicator, ", Last 4 Weeks", "")) |>
select(-phase, -quartile_range, -suppression_flag) |>
mutate(group = str_replace(group, "By ", "")) |>
drop_na(value) |>
select(start_date = time_period_start_date, end_date = time_period_end_date, everything())
tidydata =
tidydata |>
mutate(week_number = match(time_period_label, unique(time_period_label))) |>
mutate(
start_dates = mdy(pull(tidydata, start_date)),
end_dates = mdy(pull(tidydata, end_date))) |>
mutate(tpl = time_period_label) |>
mutate(year = as.numeric(str_extract(tpl, "\\d{4}")))|>
separate(col = time_period_label, into = c("week_label", "years"), sep = ", ", remove = FALSE, extra = "merge") |>
select(indicator, year, start_dates, end_dates, week_number, week_label = time_period_label, state, group, subgroup, value, low_ci, high_ci, confidence_interval)
This EDA will analyze mental health care access trends by state.
I will first create a dataframe that only includes state-level observations.
state_df =
tidydata |>
filter(group == "State")
Let’s start with the mental health indicator “Took Prescription Medication for Mental Health”
The following chunk shows the top five states with the highest mean percentage of respondents who reported taking prescription medication for mental health from 2020-2022.
state_meds_top5 =
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
arrange(desc(mean_value)) |>
top_n(5)
## Selecting by mean_value
state_meds_top5 |>
knitr::kable()
| state | mean_value |
|---|---|
| West Virginia | 28.84848 |
| Kentucky | 27.37879 |
| Arkansas | 26.69091 |
| Utah | 25.75455 |
| Oklahoma | 25.70909 |
Let’s visualize a heat map of this indicator’s prevalence by US state.
First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.
state_meds =
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
mutate(state = tolower(state))
usa =
st_as_sf(map("state", fill = TRUE, plot = FALSE))
us_meds =
merge(usa, state_meds, by.x = "ID", by.y = "state", all.x = TRUE)
Next, we’ll plot the data using ggplot
heatmap_meds=
ggplot(us_meds)+
geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % taking prescription medications")+
theme_minimal()+
theme(legend.position = "right",
plot.title.position = "plot")+
labs(title = "Average % used prescription medication for mental health 2020-2022, by state")
print(heatmap_meds)
We can also look at which states have had the greatest change in medication use. This shows that not only does WV have the highest prevalence of medication use for mental health during this period, but it also has had the highest level of change in this indicator between 2020-2022.
state_meds_change =
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health", year != "2021") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
arrange(state, year) |>
mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_meds_change |>
filter(!is.na(val_change)) |>
ungroup() |>
select(-year, -mean_value) |>
slice_max(order_by = val_change, n=5) |>
knitr::kable()
| state | val_change |
|---|---|
| West Virginia | 7.044444 |
| Nebraska | 5.219444 |
| North Dakota | 5.136111 |
| Minnesota | 4.502778 |
| Kentucky | 4.452778 |
Let’s take a look at the trend for this indicator over time.
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
mutate(year = factor(year)) |>
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
layout(
title = "% used prescription medication for mental health by state & year",
xaxis = list(title = "Year"),
yaxis = list(title = "Average % used prescription medication for mental health"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
Now we’ll look at the mental health indicator “Received Counseling or Therapy”
The following chunk shows the top five states with the highest mean percentage of respondents who reported receiving counseling or therapy from 2020-2022. The state with the highest percentage during this period was District of Columbia.
state_ther_top5 =
state_df |>
filter(indicator == "Received Counseling or Therapy") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
arrange(desc(mean_value)) |>
top_n(5)
## Selecting by mean_value
state_ther_top5 |>
knitr::kable()
| state | mean_value |
|---|---|
| District of Columbia | 18.08182 |
| Massachusetts | 14.92727 |
| Rhode Island | 14.24848 |
| Vermont | 13.72121 |
| Oregon | 12.97273 |
Let’s visualize a heat map of this indicator’s prevalence by US state.
First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.
state_ther =
state_df |>
filter(indicator == "Received Counseling or Therapy") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
mutate(state = tolower(state))
usa =
st_as_sf(map("state", fill = TRUE, plot = FALSE))
us_ther =
merge(usa, state_ther, by.x = "ID", by.y = "state", all.x = TRUE)
Next, we’ll plot the data using ggplot
heatmap_ther=
ggplot(us_ther)+
geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % receiving therapy")+
theme_minimal()+
theme(legend.position = "right",
plot.title.position = "plot")+
labs(title = "Average % receiving counseling or therapy 2020-2022, by state")
print(heatmap_ther)
We can also look at which states have had the greatest change in therapy use. This shows that NE had the highest level of change in this indicator between 2020-2022. It also had a high level of change in indicator 1
state_ther_change =
state_df |>
filter(indicator == "Received Counseling or Therapy", year != "2021") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
arrange(state, year) |>
mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_ther_change |>
filter(!is.na(val_change)) |>
ungroup() |>
select(-year, -mean_value) |>
slice_max(order_by = val_change, n=5) |>
knitr::kable()
| state | val_change |
|---|---|
| Nebraska | 4.369444 |
| North Dakota | 3.922222 |
| Montana | 3.694444 |
| Oklahoma | 3.091667 |
| Delaware | 2.661111 |
Let’s take a look at the trend for this indicator over time.
state_df |>
filter(indicator == "Received Counseling or Therapy") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
mutate(year = factor(year)) |>
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
layout(
title = "% received counseling or therapy by state & year",
xaxis = list(title = "Year"),
yaxis = list(title = "Average % received therapy"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
The third indicator we’ll look at combines the first two indicators into one category. This can help give a snapshot of overall mental health care need.
The following chunk shows the top five states with the highest mean percentage of respondents who reported mental health medication and/or therapy use from 2020-2022. The state with the highest percentage during this period was West Virginia.
state_both_top5 =
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
arrange(desc(mean_value)) |>
top_n(5)
## Selecting by mean_value
state_both_top5 |>
knitr::kable()
| state | mean_value |
|---|---|
| West Virginia | 30.74242 |
| Utah | 29.82424 |
| Kentucky | 29.72121 |
| Vermont | 29.68182 |
| Maine | 29.63333 |
Let’s visualize a heat map of this indicator’s prevalence by US state.
First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.
state_both =
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
mutate(state = tolower(state))
usa =
st_as_sf(map("state", fill = TRUE, plot = FALSE))
us_both =
merge(usa, state_both, by.x = "ID", by.y = "state", all.x = TRUE)
Next, we’ll plot the data using ggplot
heatmap_both=
ggplot(us_both)+
geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % medication and/or therapy use")+
theme_minimal()+
theme(legend.position = "right",
plot.title.position = "plot")+
labs(title = "Average % using medication and/or therapy 2020-2022, by state")
print(heatmap_both)
We can also look at which states have had the greatest change in mental health care use. This shows that WV had the highest level of change in this indicator between 2020-2022. This makes sense, as WV had the highest change in medication use.
state_both_change =
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy", year != "2021") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
arrange(state, year) |>
mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_both_change |>
filter(!is.na(val_change)) |>
ungroup() |>
select(-year, -mean_value) |>
slice_max(order_by = val_change, n=5) |>
knitr::kable()
| state | val_change |
|---|---|
| West Virginia | 7.155556 |
| North Dakota | 6.500000 |
| Nebraska | 5.747222 |
| Minnesota | 4.850000 |
| Oklahoma | 4.736111 |
Let’s take a look at the trend for this indicator over time.
state_df |>
filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
mutate(year = factor(year)) |>
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
layout(
title = "% medication and/or therapy use by state & year",
xaxis = list(title = "Year"),
yaxis = list(title = "Average % medication and/or therapy use"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
The last indicator we’ll look at is describes gaps in mental health care access, specifically counseling and therapy.
The following chunk shows the top five states with the highest mean percentage of respondents who reported no receipt of needed counseling/therapy from 2020-2022. The state with the highest percentage during this period was Oregon. Surprisingly, District of Columbia also appeared on the top 5, even though it was the “state” with the highest level of therapy use during this period.
state_need_top5 =
state_df |>
filter(indicator == "Needed Counseling or Therapy But Did Not Get It") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
arrange(desc(mean_value)) |>
top_n(5)
## Selecting by mean_value
state_need_top5 |>
knitr::kable()
| state | mean_value |
|---|---|
| Oregon | 14.43939 |
| Utah | 13.77879 |
| District of Columbia | 13.67273 |
| Colorado | 13.02727 |
| Washington | 12.89697 |
Let’s visualize a heat map of this indicator’s prevalence by US state.
First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.
state_need =
state_df |>
filter(indicator == "Needed Counseling or Therapy But Did Not Get It") |>
group_by(state) |>
summarize(mean_value = mean(value)) |>
mutate(state = tolower(state))
usa =
st_as_sf(map("state", fill = TRUE, plot = FALSE))
us_need =
merge(usa, state_need, by.x = "ID", by.y = "state", all.x = TRUE)
Next, we’ll plot the data using ggplot
heatmap_need=
ggplot(us_need)+
geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % did not receive needed therapy")+
theme_minimal()+
theme(legend.position = "right",
plot.title.position = "plot")+
labs(title = "Average % did not receive needed therapy 2020-2022, by state")
print(heatmap_need)
We can also look at which states have had the greatest change in unmet mental health care need. This shows that AK had the highest level of change in this indicator between 2020-2022.
state_need_change =
state_df |>
filter(indicator == "Needed Counseling or Therapy But Did Not Get It", year != "2021") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
arrange(state, year) |>
mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_need_change |>
filter(!is.na(val_change)) |>
ungroup() |>
select(-year, -mean_value) |>
slice_max(order_by = val_change, n=5) |>
knitr::kable()
| state | val_change |
|---|---|
| Arkansas | 3.127778 |
| Wyoming | 2.694444 |
| South Dakota | 2.538889 |
| Texas | 2.363889 |
| Wisconsin | 2.308333 |
Let’s take a look at the trend for this indicator over time.
state_df |>
filter(indicator == "Needed Counseling or Therapy But Did Not Get It") |>
group_by(state, year) |>
summarize(mean_value = mean(value)) |>
mutate(year = factor(year)) |>
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
layout(
title = "% did not receive needed therapy by state & year",
xaxis = list(title = "Year"),
yaxis = list(title = "Avg % did not receive needed therapy"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.